### supplemental material to:
### Michael Kloster, Daniel Langenkämper, Martin Zurowietz, Bánk Beszteri, Tim W. Nattkemper (2020)  
### Deep learning-based diatom taxonomy on virtual slides

### analysis of DL experiment results

## required libraries
library(tidyr)
library(ggplot2)

## process evaluation data

## data preprocessing
# load the evaluation of all results from all experiments
# (read.csv2 defaults to ";" as field separator and "," as decimal mark)
data <- read.csv2("./results/F1 scores from all experiments.csv")

# build data columns from the experiment name: the name is split at every "."
# into six parts, then the "DataSets" part is split at "on" into the training
# and the test data set
dataSetup <- separate(
  data, Project.Name,
  c("CNN", "Experiment", "DataSets", "Portion", "Background", "Run"),
  sep = "\\.", remove = FALSE
)
dataSetup <- separate(dataSetup, DataSets, c("DataTraining", "DataTest"), sep = "on")

# a result is "out of set" when training and test data set differ
dataSetup$OutOfSet <- dataSetup$DataTraining != dataSetup$DataTest

# "Setup" joins all parts of the experiment name except the run
dataSetup <- unite(
  dataSetup, "Setup",
  c("CNN", "Experiment", "DataTraining", "DataTest", "Portion", "Background"),
  remove = FALSE
)

# factorize with an explicit level order; the first level of each factor
# serves as the base-line of the ANOVAs below
dataSetup$CNN <- factor(
  dataSetup$CNN,
  levels = c(
    "VGG16_1FC", "VGG16_2FC", "VGG19_1FC", "VGG19_2FC", "Xception",
    "DenseNet", "InceptionResNetv2", "MobileNetV2", "InceptionV3"
  )
)
dataSetup$Background <- factor(dataSetup$Background, levels = c("masked", "unmasked"))
dataSetup$Portion <- factor(dataSetup$Portion, levels = c("100p", "10p"))

# keep a subset with only the results of the best performing CNN model "VGG16_1FC"
data.VGG16_1FC <- dataSetup[dataSetup$CNN == "VGG16_1FC", ]


### Tables

## Table 5: Classification performance of different models, average over experiments 1 - 16, calculated by ANOVA

# The summaries are printed explicitly (as for the other tables) so that they
# also appear when the script is run via source(), which does not auto-print
# top-level values by default.

# micro F1
# base-line is VGG16_1FC
model.CNN <- lm(formula = Micro.F1 ~ CNN, data = dataSetup)
print(summary(model.CNN))

# macro F1
# base-line is VGG16_1FC
# (model.CNN is reused, so from here on it holds the macro F1 model)
model.CNN <- lm(formula = Macro.F1 ~ CNN, data = dataSetup)
print(summary(model.CNN))


## Table 6: Classification performance per class for the initial experiment AB100,-|AB100,-

# see file "./results/Detailed results VGG16_1FC.Exp0.ABonAB.unmasked.txt", which is the console output from the initial experiment


## Table 7: Deep learning experiments, results for the best performing model “VGG16_1FC”
data.VGG16_1FC <- dataSetup[dataSetup$CNN == "VGG16_1FC", ]

# One row per experiment with the mean micro / macro F1 score over all of its
# result rows, formatted with two decimals.
# The work is wrapped in local() so that the helper variables do not end up in
# the global environment (the loop variable used to shadow base::exp there).
resultsTable.VGG16 <- local({
  # empty table with exactly the columns produced below; rbind() ignores
  # zero-row arguments, so it only matters if there is no experiment at all
  emptyTable <- data.frame(
    Row = numeric(), Experiment = character(), Setup = character(),
    micro.F1 = character(), macro.F1 = character()
  )

  experiments <- levels(as.factor(data.VGG16_1FC$Experiment))

  tableRows <- lapply(seq_along(experiments), function(i) {
    d <- data.VGG16_1FC[data.VGG16_1FC$Experiment == experiments[i], ]

    # portion and masking are the same for the training and the test side,
    # so the labels are derived once and used for both
    portionLabel <- ifelse(d$Portion[1] == "100p", "100", "10")
    maskLabel <- ifelse(d$Background[1] == "masked", ",+", ",-")
    trainSetting <- paste0(d$DataTraining[1], portionLabel, maskLabel)
    testSetting <- paste0(d$DataTest[1], portionLabel, maskLabel)

    data.frame(
      Row = as.numeric(i),
      Experiment = d$Setup[1],
      Setup = paste0(trainSetting, "|", testSetting),
      micro.F1 = format(round(mean(d$Micro.F1), digits = 2), nsmall = 2),
      macro.F1 = format(round(mean(d$Macro.F1), digits = 2), nsmall = 2)
    )
  })

  # combine all rows in a single rbind() call instead of growing the table
  do.call(rbind, c(list(emptyTable), tableRows))
})

# print explicitly so the table also shows up when the script is source()d
print(resultsTable.VGG16)


## Table 8: ANOVA results of F1 scores for model “VGG16_1FC” experiments

# beware: results are sorted in another order in the manuscript, flag indicating "background masked" is inverted
# please note: base-line is (portion == "100%", background == "masked", NOT out-of-set)

# micro F1: linear model with all main effects and interactions of
# portion, background and out-of-set
model.micro <- lm(
  Micro.F1 ~ Portion * Background * OutOfSet,
  data = data.VGG16_1FC
)
print(summary(model.micro))

# macro F1: same model structure as for micro F1
model.macro <- lm(
  Macro.F1 ~ Portion * Background * OutOfSet,
  data = data.VGG16_1FC
)
print(summary(model.macro))


## Table 9: ANOVA results of F1 scores for model “VGG16_1FC” experiments utilizing only the full data 

# please note: base-line is (background == "masked", NOT out-of-set)

# keep only the results obtained with the full data;
# the 10% data is excluded as statistically problematic
data.VGG16_1FC.100p <- data.VGG16_1FC[data.VGG16_1FC$Portion == "100p", ]

# micro F1 w/o 10%: background, out-of-set and their interaction
model.micro.100p <- lm(
  Micro.F1 ~ Background * OutOfSet,
  data = data.VGG16_1FC.100p
)
print(summary(model.micro.100p))

# macro F1 w/o 10%: same model structure as for micro F1
model.macro.100p <- lm(
  Macro.F1 ~ Background * OutOfSet,
  data = data.VGG16_1FC.100p
)
print(summary(model.macro.100p))


## Table S1: ANOVA of F1 scores for multiple models

# rows "all experiments" see Table 5.

# rows for individual experiments and each CNN architecture:
# The loop uses its own variable names (suffix ".cnn") so that it neither
# overwrites the raw input "data" nor the Table 8 model "model.micro", and so
# that the macro F1 model is no longer stored under a "micro" name.
for (cnn in levels(dataSetup$CNN)) { # iterate over all cnn architectures
  cat("Statistics on ", cnn, "\n", sep = "")
  data.cnn <- dataSetup[dataSetup$CNN == cnn, ]

  # anova micro F1
  model.micro.cnn <- lm(formula = Micro.F1 ~ Portion * Background * OutOfSet, data = data.cnn)
  print(summary(model.micro.cnn))

  # anova macro F1
  model.macro.cnn <- lm(formula = Macro.F1 ~ Portion * Background * OutOfSet, data = data.cnn)
  print(summary(model.macro.cnn))
  cat("\n\n")
}


### Figures

## Figure 4: Boxplots comparing the classification performance for model “VGG16_1FC” experiments. 
# please note: in the manuscript positions of F1 micro and macro are swapped.

# Bring micro and macro F1 into one long table: one row per score, the score
# itself in column "F1score" and its kind in column "F1" (used for facetting).
# The score column is renamed by name rather than by position.
data.F1micro <- data.VGG16_1FC[, c("Setup", "Background", "Micro.F1")]
data.F1micro$F1 <- "F1micro"
names(data.F1micro)[names(data.F1micro) == "Micro.F1"] <- "F1score"

data.F1macro <- data.VGG16_1FC[, c("Setup", "Background", "Macro.F1")]
data.F1macro$F1 <- "F1macro"
names(data.F1macro)[names(data.F1macro) == "Macro.F1"] <- "F1score"

data.combined <- rbind(data.F1micro, data.F1macro)

plotF1Combined <- ggplot(data = data.combined, aes(x = Setup, y = F1score, fill = Background)) +
  # NOTE(review): this first boxplot layer is drawn again further down; it
  # presumably exists so that the discrete x scale is set up before the
  # rectangles with numeric x limits are added - confirm before removing
  geom_boxplot(lwd = 0.5) +
  # background shading: blue up to x = 8.5, green from x = 8.5 on; the second
  # rectangle of each colour overlaps the first, so that part appears darker
  annotate("rect", xmin = -Inf, xmax = 8.5, ymin = -Inf, ymax = Inf, fill = "#0867c1", alpha = 0.2) +
  annotate("rect", xmin = -Inf, xmax = 4.5, ymin = -Inf, ymax = Inf, fill = "#0867c1", alpha = 0.2) +
  annotate("rect", xmin = 8.5, xmax = Inf, ymin = -Inf, ymax = Inf, fill = "#76cc00", alpha = 0.2) +
  annotate("rect", xmin = 8.5, xmax = 12.5, ymin = -Inf, ymax = Inf, fill = "#76cc00", alpha = 0.2) +
  theme(axis.text.x = element_text(angle = 90)) +
  # boxplots drawn on top of the shading
  geom_boxplot() +
  scale_fill_manual(values = c("#AAAAAA", "#FFFFFF")) +
  theme(legend.position = "none") +
  # mark the mean of each box; "fun" replaces the argument "fun.y", which is
  # deprecated since ggplot2 3.3.0
  stat_summary(fun = mean, colour = "darkred", geom = "point", shape = 18, size = 3, show.legend = FALSE) +
  facet_wrap(~F1)

print(plotF1Combined)


